In [1]:
import pandas as pd
import numpy as num
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as pit
import warnings
warnings.filterwarnings('ignore')
Load the Dataset¶
In [2]:
cd = pd.read_csv("D:/CRICKET DATASET/Crickets data.csv")
cd
Out[2]:
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | ... | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | player_dismissed | dismissal_kind | fielder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 1 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN |
| 1 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 2 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN |
| 2 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 3 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 4 | 0 | 4 | NaN | NaN | NaN |
| 3 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 4 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN |
| 4 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 5 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 2 | 2 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150455 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 2 | Sachin Baby | CJ Jordan | B Kumar | 0 | ... | 0 | 0 | 0 | 0 | 2 | 0 | 2 | NaN | NaN | NaN |
| 150456 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 3 | Sachin Baby | CJ Jordan | B Kumar | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | CJ Jordan | run out | NV Ojha |
| 150457 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 4 | Iqbal Abdulla | Sachin Baby | B Kumar | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 1 | NaN | NaN | NaN |
| 150458 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 5 | Sachin Baby | Iqbal Abdulla | B Kumar | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | NaN | NaN | NaN |
| 150459 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 6 | Iqbal Abdulla | Sachin Baby | B Kumar | 0 | ... | 0 | 0 | 0 | 0 | 4 | 0 | 4 | NaN | NaN | NaN |
150460 rows × 21 columns
In [3]:
dd = pd.read_csv("D:/kk/delivers match.csv")
dd
Out[3]:
| id | season | city | date | team1 | team2 | toss_winner | toss_decision | result | dl_applied | winner | win_by_runs | win_by_wickets | player_of_match | venue | umpire1 | umpire2 | umpire3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017 | Hyderabad | 2017-04-05 | Sunrisers Hyderabad | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Sunrisers Hyderabad | 35 | 0 | Yuvraj Singh | Rajiv Gandhi International Stadium, Uppal | AY Dandekar | NJ Llong | NaN |
| 1 | 2 | 2017 | Pune | 2017-04-06 | Mumbai Indians | Rising Pune Supergiant | Rising Pune Supergiant | field | normal | 0 | Rising Pune Supergiant | 0 | 7 | SPD Smith | Maharashtra Cricket Association Stadium | A Nand Kishore | S Ravi | NaN |
| 2 | 3 | 2017 | Rajkot | 2017-04-07 | Gujarat Lions | Kolkata Knight Riders | Kolkata Knight Riders | field | normal | 0 | Kolkata Knight Riders | 0 | 10 | CA Lynn | Saurashtra Cricket Association Stadium | Nitin Menon | CK Nandan | NaN |
| 3 | 4 | 2017 | Indore | 2017-04-08 | Rising Pune Supergiant | Kings XI Punjab | Kings XI Punjab | field | normal | 0 | Kings XI Punjab | 0 | 6 | GJ Maxwell | Holkar Cricket Stadium | AK Chaudhary | C Shamshuddin | NaN |
| 4 | 5 | 2017 | Bangalore | 2017-04-08 | Royal Challengers Bangalore | Delhi Daredevils | Royal Challengers Bangalore | bat | normal | 0 | Royal Challengers Bangalore | 15 | 0 | KM Jadhav | M Chinnaswamy Stadium | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 631 | 632 | 2016 | Raipur | 2016-05-22 | Delhi Daredevils | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Royal Challengers Bangalore | 0 | 6 | V Kohli | Shaheed Veer Narayan Singh International Stadium | A Nand Kishore | BNJ Oxenford | NaN |
| 632 | 633 | 2016 | Bangalore | 2016-05-24 | Gujarat Lions | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Royal Challengers Bangalore | 0 | 4 | AB de Villiers | M Chinnaswamy Stadium | AK Chaudhary | HDPK Dharmasena | NaN |
| 633 | 634 | 2016 | Delhi | 2016-05-25 | Sunrisers Hyderabad | Kolkata Knight Riders | Kolkata Knight Riders | field | normal | 0 | Sunrisers Hyderabad | 22 | 0 | MC Henriques | Feroz Shah Kotla | M Erasmus | C Shamshuddin | NaN |
| 634 | 635 | 2016 | Delhi | 2016-05-27 | Gujarat Lions | Sunrisers Hyderabad | Sunrisers Hyderabad | field | normal | 0 | Sunrisers Hyderabad | 0 | 4 | DA Warner | Feroz Shah Kotla | M Erasmus | CK Nandan | NaN |
| 635 | 636 | 2016 | Bangalore | 2016-05-29 | Sunrisers Hyderabad | Royal Challengers Bangalore | Sunrisers Hyderabad | bat | normal | 0 | Sunrisers Hyderabad | 8 | 0 | BCJ Cutting | M Chinnaswamy Stadium | HDPK Dharmasena | BNJ Oxenford | NaN |
636 rows × 18 columns
Inspect the dataset¶
In [222]:
cd.shape
Out[222]:
(150460, 21)
In [223]:
dd.shape
Out[223]:
(636, 18)
In [224]:
cd = cd.drop(['player_dismissed','fielder','dismissal_kind'],axis=1)
cd
Out[224]:
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | wide_runs | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 1 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 2 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 3 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 4 |
| 3 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 4 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 5 | DA Warner | S Dhawan | TS Mills | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150455 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 2 | Sachin Baby | CJ Jordan | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 |
| 150456 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 3 | Sachin Baby | CJ Jordan | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 150457 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 4 | Iqbal Abdulla | Sachin Baby | B Kumar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 |
| 150458 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 5 | Sachin Baby | Iqbal Abdulla | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 150459 | 636 | 2 | Royal Challengers Bangalore | Sunrisers Hyderabad | 20 | 6 | Iqbal Abdulla | Sachin Baby | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 4 |
150460 rows × 18 columns
In [225]:
cd.shape
Out[225]:
(150460, 18)
In [226]:
dd = dd.drop(['id','dl_applied','date','venue','umpire1','umpire2','umpire3'],axis=1)
dd
Out[226]:
| season | city | team1 | team2 | toss_winner | toss_decision | result | winner | win_by_runs | win_by_wickets | player_of_match | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017 | Hyderabad | Sunrisers Hyderabad | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | Sunrisers Hyderabad | 35 | 0 | Yuvraj Singh |
| 1 | 2017 | Pune | Mumbai Indians | Rising Pune Supergiant | Rising Pune Supergiant | field | normal | Rising Pune Supergiant | 0 | 7 | SPD Smith |
| 2 | 2017 | Rajkot | Gujarat Lions | Kolkata Knight Riders | Kolkata Knight Riders | field | normal | Kolkata Knight Riders | 0 | 10 | CA Lynn |
| 3 | 2017 | Indore | Rising Pune Supergiant | Kings XI Punjab | Kings XI Punjab | field | normal | Kings XI Punjab | 0 | 6 | GJ Maxwell |
| 4 | 2017 | Bangalore | Royal Challengers Bangalore | Delhi Daredevils | Royal Challengers Bangalore | bat | normal | Royal Challengers Bangalore | 15 | 0 | KM Jadhav |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 631 | 2016 | Raipur | Delhi Daredevils | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | Royal Challengers Bangalore | 0 | 6 | V Kohli |
| 632 | 2016 | Bangalore | Gujarat Lions | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | Royal Challengers Bangalore | 0 | 4 | AB de Villiers |
| 633 | 2016 | Delhi | Sunrisers Hyderabad | Kolkata Knight Riders | Kolkata Knight Riders | field | normal | Sunrisers Hyderabad | 22 | 0 | MC Henriques |
| 634 | 2016 | Delhi | Gujarat Lions | Sunrisers Hyderabad | Sunrisers Hyderabad | field | normal | Sunrisers Hyderabad | 0 | 4 | DA Warner |
| 635 | 2016 | Bangalore | Sunrisers Hyderabad | Royal Challengers Bangalore | Sunrisers Hyderabad | bat | normal | Sunrisers Hyderabad | 8 | 0 | BCJ Cutting |
636 rows × 11 columns
In [227]:
dd.shape
Out[227]:
(636, 11)
Get the summary of your dataset¶
In [228]:
cd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150460 entries, 0 to 150459 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 match_id 150460 non-null int64 1 inning 150460 non-null int64 2 batting_team 150460 non-null object 3 bowling_team 150460 non-null object 4 over 150460 non-null int64 5 ball 150460 non-null int64 6 batsman 150460 non-null object 7 non_striker 150460 non-null object 8 bowler 150460 non-null object 9 is_super_over 150460 non-null int64 10 wide_runs 150460 non-null int64 11 bye_runs 150460 non-null int64 12 legbye_runs 150460 non-null int64 13 noball_runs 150460 non-null int64 14 penalty_runs 150460 non-null int64 15 batsman_runs 150460 non-null int64 16 extra_runs 150460 non-null int64 17 total_runs 150460 non-null int64 dtypes: int64(13), object(5) memory usage: 20.7+ MB
Identify missing values¶¶
In [229]:
cd.isnull().sum()
Out[229]:
match_id 0 inning 0 batting_team 0 bowling_team 0 over 0 ball 0 batsman 0 non_striker 0 bowler 0 is_super_over 0 wide_runs 0 bye_runs 0 legbye_runs 0 noball_runs 0 penalty_runs 0 batsman_runs 0 extra_runs 0 total_runs 0 dtype: int64
In [230]:
dd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 636 entries, 0 to 635 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 season 636 non-null int64 1 city 629 non-null object 2 team1 636 non-null object 3 team2 636 non-null object 4 toss_winner 636 non-null object 5 toss_decision 636 non-null object 6 result 636 non-null object 7 winner 633 non-null object 8 win_by_runs 636 non-null int64 9 win_by_wickets 636 non-null int64 10 player_of_match 633 non-null object dtypes: int64(3), object(8) memory usage: 54.8+ KB
Identify missing values¶¶
In [231]:
dd.isnull().sum()
Out[231]:
season 0 city 7 team1 0 team2 0 toss_winner 0 toss_decision 0 result 0 winner 3 win_by_runs 0 win_by_wickets 0 player_of_match 3 dtype: int64
List all column names¶
In [232]:
cd.columns
Out[232]:
Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
'batsman_runs', 'extra_runs', 'total_runs'],
dtype='object')
In [233]:
dd.columns
Out[233]:
Index(['season', 'city', 'team1', 'team2', 'toss_winner', 'toss_decision',
'result', 'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match'],
dtype='object')
In [234]:
cd['batting_team'] = cd['batting_team'].map({'Chennai Super Kings':'CSK',
'Deccan Chargers':'DC',
'Delhi Daredevils':'DD',
'Gujarat Lions':'GL',
'Kings XI Punjab':'KXIP',
'Kochi Tuskers Kerala':'KTK',
'Kolkata Knight Riders':'KKR',
'Mumbai Indians':'MI',
'Pune Warriors':'PW',
'Rajasthan Royals':'RR',
'Rising Pune Supergiant':'RPS',
'Rising Pune Supergiants':'RPSS',
'Royal Challengers Bangalore':'RCB',
'Sunrisers Hyderabad':'SH'})
cd['batting_team']
Out[234]:
0 SH
1 SH
2 SH
3 SH
4 SH
...
150455 RCB
150456 RCB
150457 RCB
150458 RCB
150459 RCB
Name: batting_team, Length: 150460, dtype: object
In [235]:
cd['bowling_team'] = cd['bowling_team'].map({'Chennai Super Kings':'CSK',
'Deccan Chargers':'DC',
'Delhi Daredevils':'DD',
'Gujarat Lions':'GL',
'Kings XI Punjab':'KXIP',
'Kochi Tuskers Kerala':'KTK',
'Kolkata Knight Riders':'KKR',
'Mumbai Indians':'MI',
'Pune Warriors':'PW',
'Rajasthan Royals':'RR',
'Rising Pune Supergiant':'RPS',
'Rising Pune Supergiants':'RPSS',
'Royal Challengers Bangalore':'RCB',
'Sunrisers Hyderabad':'SH'})
cd['bowling_team']
Out[235]:
0 RCB
1 RCB
2 RCB
3 RCB
4 RCB
...
150455 SH
150456 SH
150457 SH
150458 SH
150459 SH
Name: bowling_team, Length: 150460, dtype: object
In [236]:
cd.head()
Out[236]:
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | wide_runs | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | SH | RCB | 1 | 1 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | SH | RCB | 1 | 2 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 1 | SH | RCB | 1 | 3 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 4 |
| 3 | 1 | 1 | SH | RCB | 1 | 4 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 1 | SH | RCB | 1 | 5 | DA Warner | S Dhawan | TS Mills | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 2 |
In [237]:
cd.tail()
Out[237]:
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | wide_runs | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 150455 | 636 | 2 | RCB | SH | 20 | 2 | Sachin Baby | CJ Jordan | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 2 |
| 150456 | 636 | 2 | RCB | SH | 20 | 3 | Sachin Baby | CJ Jordan | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 150457 | 636 | 2 | RCB | SH | 20 | 4 | Iqbal Abdulla | Sachin Baby | B Kumar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 |
| 150458 | 636 | 2 | RCB | SH | 20 | 5 | Sachin Baby | Iqbal Abdulla | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 150459 | 636 | 2 | RCB | SH | 20 | 6 | Iqbal Abdulla | Sachin Baby | B Kumar | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 4 |
In [238]:
dd['team1'] = dd['team1'].map({'Chennai Super Kings':'CSK',
'Deccan Chargers':'DC',
'Delhi Daredevils':'DD',
'Gujarat Lions':'GL',
'Kings XI Punjab':'KXIP',
'Kochi Tuskers Kerala':'KTK',
'Kolkata Knight Riders':'KKR',
'Mumbai Indians':'MI',
'Pune Warriors':'PW',
'Rajasthan Royals':'RR',
'Rising Pune Supergiant':'RPS',
'Rising Pune Supergiants':'RPSS',
'Royal Challengers Bangalore':'RCB',
'Sunrisers Hyderabad':'SH'})
dd['team1']
Out[238]:
0 SH
1 MI
2 GL
3 RPS
4 RCB
...
631 DD
632 GL
633 SH
634 GL
635 SH
Name: team1, Length: 636, dtype: object
In [239]:
dd['team2'] = dd['team2'].map({'Chennai Super Kings':'CSK',
'Deccan Chargers':'DC',
'Delhi Daredevils':'DD',
'Gujarat Lions':'GL',
'Kings XI Punjab':'KXIP',
'Kochi Tuskers Kerala':'KTK',
'Kolkata Knight Riders':'KKR',
'Mumbai Indians':'MI',
'Pune Warriors':'PW',
'Rajasthan Royals':'RR',
'Rising Pune Supergiant':'RPS',
'Rising Pune Supergiants':'RPSS',
'Royal Challengers Bangalore':'RCB',
'Sunrisers Hyderabad':'SH'})
dd['team2']
Out[239]:
0 RCB
1 RPS
2 KKR
3 KXIP
4 DD
...
631 RCB
632 RCB
633 KKR
634 SH
635 RCB
Name: team2, Length: 636, dtype: object
In [240]:
dd['toss_winner'] = dd['toss_winner'].map({'Chennai Super Kings':'CSK',
'Deccan Chargers':'DC',
'Delhi Daredevils':'DD',
'Gujarat Lions':'GL',
'Kings XI Punjab':'KXIP',
'Kochi Tuskers Kerala':'KTK',
'Kolkata Knight Riders':'KKR',
'Mumbai Indians':'MI',
'Pune Warriors':'PW',
'Rajasthan Royals':'RR',
'Rising Pune Supergiant':'RPS',
'Rising Pune Supergiants':'RPSS',
'Royal Challengers Bangalore':'RCB',
'Sunrisers Hyderabad':'SH'})
dd['toss_winner']
Out[240]:
0 RCB
1 RPS
2 KKR
3 KXIP
4 RCB
...
631 RCB
632 RCB
633 KKR
634 SH
635 SH
Name: toss_winner, Length: 636, dtype: object
In [241]:
dd['winner'] = dd['winner'].map({'Chennai Super Kings':'CSK',
'Deccan Chargers':'DC',
'Delhi Daredevils':'DD',
'Gujarat Lions':'GL',
'Kings XI Punjab':'KXIP',
'Kochi Tuskers Kerala':'KTK',
'Kolkata Knight Riders':'KKR',
'Mumbai Indians':'MI',
'Pune Warriors':'PW',
'Rajasthan Royals':'RR',
'Rising Pune Supergiant':'RPS',
'Rising Pune Supergiants':'RPSS',
'Royal Challengers Bangalore':'RCB',
'Sunrisers Hyderabad':'SH'})
dd['winner']
Out[241]:
0 SH
1 RPS
2 KKR
3 KXIP
4 RCB
...
631 RCB
632 RCB
633 SH
634 SH
635 SH
Name: winner, Length: 636, dtype: object
In [242]:
dd.head()
Out[242]:
| season | city | team1 | team2 | toss_winner | toss_decision | result | winner | win_by_runs | win_by_wickets | player_of_match | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017 | Hyderabad | SH | RCB | RCB | field | normal | SH | 35 | 0 | Yuvraj Singh |
| 1 | 2017 | Pune | MI | RPS | RPS | field | normal | RPS | 0 | 7 | SPD Smith |
| 2 | 2017 | Rajkot | GL | KKR | KKR | field | normal | KKR | 0 | 10 | CA Lynn |
| 3 | 2017 | Indore | RPS | KXIP | KXIP | field | normal | KXIP | 0 | 6 | GJ Maxwell |
| 4 | 2017 | Bangalore | RCB | DD | RCB | bat | normal | RCB | 15 | 0 | KM Jadhav |
In [216]:
dd.tail()
Out[216]:
| season | city | team1 | team2 | toss_winner | toss_decision | result | winner | win_by_runs | win_by_wickets | player_of_match | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 631 | 2016 | Raipur | DD | RCB | RCB | field | normal | RCB | 0 | 6 | V Kohli |
| 632 | 2016 | Bangalore | GL | RCB | RCB | field | normal | RCB | 0 | 4 | AB de Villiers |
| 633 | 2016 | Delhi | SH | KKR | KKR | field | normal | SH | 22 | 0 | MC Henriques |
| 634 | 2016 | Delhi | GL | SH | SH | field | normal | SH | 0 | 4 | DA Warner |
| 635 | 2016 | Bangalore | SH | RCB | SH | bat | normal | SH | 8 | 0 | BCJ Cutting |
Get basic statistical details¶
In [143]:
cd.describe()
Out[143]:
| match_id | inning | over | ball | is_super_over | wide_runs | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 | 150460.000000 |
| mean | 318.281317 | 1.482188 | 10.142649 | 3.616483 | 0.000538 | 0.037498 | 0.004885 | 0.022232 | 0.004340 | 0.000066 | 1.222445 | 0.069022 | 1.291466 |
| std | 182.955531 | 0.501768 | 5.674338 | 1.807698 | 0.023196 | 0.257398 | 0.114234 | 0.200104 | 0.072652 | 0.018229 | 1.594509 | 0.349667 | 1.583240 |
| min | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 161.000000 | 1.000000 | 5.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 319.000000 | 1.000000 | 10.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 |
| 75% | 476.000000 | 2.000000 | 15.000000 | 5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 |
| max | 636.000000 | 4.000000 | 20.000000 | 9.000000 | 1.000000 | 5.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 | 6.000000 | 7.000000 | 7.000000 |
In [144]:
dd.describe()
Out[144]:
| season | win_by_runs | win_by_wickets | |
|---|---|---|---|
| count | 636.000000 | 636.000000 | 636.000000 |
| mean | 2012.490566 | 13.682390 | 3.372642 |
| std | 2.773026 | 23.908877 | 3.420338 |
| min | 2008.000000 | 0.000000 | 0.000000 |
| 25% | 2010.000000 | 0.000000 | 0.000000 |
| 50% | 2012.000000 | 0.000000 | 4.000000 |
| 75% | 2015.000000 | 20.000000 | 7.000000 |
| max | 2017.000000 | 146.000000 | 10.000000 |
In [146]:
cd.boxplot(column =["inning","over","ball","is_super_over","wide_runs","bye_runs","legbye_runs","noball_runs","penalty_runs","batsman_runs","extra_runs","total_runs"])
plt.xticks(rotation=90)
plt.title('Finding any outliners')
plt.show()
In [147]:
cd['total_runs'].describe()
Out[147]:
count 150460.000000 mean 1.291466 std 1.583240 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 7.000000 Name: total_runs, dtype: float64
In [148]:
import numpy as np
Q1=cd['total_runs'].quantile(0.25)
Q3=cd['total_runs'].quantile(0.75)
IQR=Q3-Q1
lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR
print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")
median_value = cd['total_runs'].median()
print(f"Median value: {median_value}")
cd['total_runs']=np.where((cd['total_runs']< lower_bound) | (cd['total_runs']> Upper_bound),median_value,cd['total_runs'])
Lower Bound: -1.5 Upper Bound: 2.5 Median value: 1.0
In [149]:
cd.boxplot(column = ['total_runs'])
plt.title('Total_runs')
plt.show()
In [150]:
cd.shape
Out[150]:
(150460, 18)
In [151]:
cd['batsman_runs'].describe()
Out[151]:
count 150460.000000 mean 1.222445 std 1.594509 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 6.000000 Name: batsman_runs, dtype: float64
In [152]:
import numpy as np
Q1=cd['batsman_runs'].quantile(0.25)
Q3=cd['batsman_runs'].quantile(0.75)
IQR=Q3-Q1
lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR
print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")
median_value = cd['batsman_runs'].median()
print(f"Median value: {median_value}")
cd['batsman_runs']=np.where((cd['batsman_runs']< lower_bound) | (cd['batsman_runs']> Upper_bound),median_value,cd['batsman_runs'])
Lower Bound: -1.5 Upper Bound: 2.5 Median value: 1.0
In [153]:
cd.boxplot(column = ['batsman_runs'])
plt.title('Batsman_runs')
plt.show()
In [154]:
cd.shape
Out[154]:
(150460, 18)
In [155]:
cd['inning'].describe()
Out[155]:
count 150460.000000 mean 1.482188 std 0.501768 min 1.000000 25% 1.000000 50% 1.000000 75% 2.000000 max 4.000000 Name: inning, dtype: float64
In [156]:
import numpy as np
Q1=cd['inning'].quantile(0.25)
Q3=cd['inning'].quantile(0.75)
IQR=Q3-Q1
lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR
print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")
median_value = cd['inning'].median()
print(f"Median value: {median_value}")
cd['inning']=np.where((cd['inning']< lower_bound) | (cd['inning']> Upper_bound),median_value,cd['inning'])
Lower Bound: -0.5 Upper Bound: 3.5 Median value: 1.0
In [157]:
cd.boxplot(column = ['inning'])
plt.title('Innings')
plt.show()
In [158]:
cd.shape
Out[158]:
(150460, 18)
In [159]:
dd.boxplot(column = ['win_by_runs','win_by_wickets'])
plt.title('Finding any outliners')
plt.show()
In [160]:
import numpy as np
Q1=dd['win_by_runs'].quantile(0.25)
Q3=dd['win_by_runs'].quantile(0.75)
IQR=Q3-Q1
lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR
print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")
median_value = dd['win_by_runs'].median()
print(f"Median value: {median_value}")
dd['win_by_runs']=np.where((dd['win_by_runs']< lower_bound) | (dd['win_by_runs']> Upper_bound),median_value,dd['win_by_runs'])
Lower Bound: -30.0 Upper Bound: 50.0 Median value: 0.0
In [161]:
dd.boxplot(column = ['win_by_runs'])
plt.title('Finding any outliners')
plt.show()
In [162]:
dd.shape
Out[162]:
(636, 11)
In [163]:
cd['match_id'].unique()
Out[163]:
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286,
287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312,
313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338,
339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364,
365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403,
404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416,
417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442,
443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455,
456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468,
469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481,
482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507,
508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533,
534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546,
547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559,
560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572,
573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585,
586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598,
599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611,
612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636],
dtype=int64)
In [164]:
match_1 = cd[cd['match_id']==1]
In [165]:
match_1.head()
Out[165]:
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | wide_runs | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1.0 | SH | RCB | 1 | 1 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 1 | 1 | 1.0 | SH | RCB | 1 | 2 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 2 | 1 | 1.0 | SH | RCB | 1 | 3 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 0 | 1.0 |
| 3 | 1 | 1.0 | SH | RCB | 1 | 4 | DA Warner | S Dhawan | TS Mills | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 4 | 1 | 1.0 | SH | RCB | 1 | 5 | DA Warner | S Dhawan | TS Mills | 0 | 2 | 0 | 0 | 0 | 0 | 0.0 | 2 | 2.0 |
In [166]:
match_1.shape
Out[166]:
(248, 18)
In [167]:
srh=match_1[match_1['inning']==1]
In [168]:
srh['batsman_runs'].value_counts()
Out[168]:
batsman_runs 1.0 84 0.0 32 2.0 9 Name: count, dtype: int64
In [169]:
rcb=match_1[match_1['inning']==2]
In [170]:
rcb['batsman_runs'].value_counts()
Out[170]:
batsman_runs 1.0 67 0.0 49 2.0 7 Name: count, dtype: int64
Visuallization¶
In [171]:
fig = pit.pie(data_frame=cd,names='over')
fig.show() # Increase to decrease
In [73]:
fig = pit.pie(data_frame=cd,names='ball')
plt.title('Distribution of ball')
fig.show()
In [74]:
fig = pit.pie(data_frame=cd,names='wide_runs')
fig.show()
In [75]:
fig = pit.pie(data_frame=cd,names='bye_runs')
fig.show()
In [76]:
fig = pit.pie(data_frame=cd,names='legbye_runs')
fig.show()
In [77]:
fig = pit.pie(data_frame=cd,names='noball_runs')
fig.show()
In [78]:
fig = pit.pie(data_frame=cd,names='penalty_runs')
fig.show()
In [79]:
fig = pit.pie(data_frame=cd,names='extra_runs')
fig.show()
In [80]:
fig = pit.pie(data_frame=cd,names='total_runs')
fig.show()
In [81]:
cd['is_super_over']=cd['is_super_over'].map({0:'No super over',1:'Super over'})
cd
Out[81]:
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | wide_runs | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1.0 | SH | RCB | 1 | 1 | DA Warner | S Dhawan | TS Mills | No super over | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 1 | 1 | 1.0 | SH | RCB | 1 | 2 | DA Warner | S Dhawan | TS Mills | No super over | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 2 | 1 | 1.0 | SH | RCB | 1 | 3 | DA Warner | S Dhawan | TS Mills | No super over | 0 | 0 | 0 | 0 | 0 | 1.0 | 0 | 1.0 |
| 3 | 1 | 1.0 | SH | RCB | 1 | 4 | DA Warner | S Dhawan | TS Mills | No super over | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 4 | 1 | 1.0 | SH | RCB | 1 | 5 | DA Warner | S Dhawan | TS Mills | No super over | 2 | 0 | 0 | 0 | 0 | 0.0 | 2 | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150455 | 636 | 2.0 | RCB | SH | 20 | 2 | Sachin Baby | CJ Jordan | B Kumar | No super over | 0 | 0 | 0 | 0 | 0 | 2.0 | 0 | 2.0 |
| 150456 | 636 | 2.0 | RCB | SH | 20 | 3 | Sachin Baby | CJ Jordan | B Kumar | No super over | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 |
| 150457 | 636 | 2.0 | RCB | SH | 20 | 4 | Iqbal Abdulla | Sachin Baby | B Kumar | No super over | 0 | 0 | 1 | 0 | 0 | 0.0 | 1 | 1.0 |
| 150458 | 636 | 2.0 | RCB | SH | 20 | 5 | Sachin Baby | Iqbal Abdulla | B Kumar | No super over | 0 | 0 | 0 | 0 | 0 | 1.0 | 0 | 1.0 |
| 150459 | 636 | 2.0 | RCB | SH | 20 | 6 | Iqbal Abdulla | Sachin Baby | B Kumar | No super over | 0 | 0 | 0 | 0 | 0 | 1.0 | 0 | 1.0 |
150460 rows × 18 columns
In [83]:
plt.figure(figsize=(5,4))
sns.countplot(x='is_super_over',data=cd,palette=['Red','black'])
plt.title('Count plot')
plt.show()
# 150000 of instances or counts of match happened
SO THIS WE CAN SAY THERE IS NO SUPER OVER IN THE MATCH¶
In [85]:
plt.figure(figsize=(5,6))
sns.countplot(x='batting_team',data=cd)
plt.xticks(rotation=90)
plt.title('Count plot')
plt.show()
In [86]:
cd['batting_team'].value_counts()
Out[86]:
batting_team MI 18943 RCB 17678 KXIP 17594 KKR 17229 DD 17185 CSK 15754 RR 13914 SH 9058 DC 9034 PW 5443 GL 3566 RPS 1900 KTK 1582 RPSS 1580 Name: count, dtype: int64
In [215]:
plt.figure(figsize=(5,6))
sns.countplot(x='bowling_team',data=cd)
plt.xticks(rotation=90)
plt.title('Count plot')
plt.show()
In [84]:
cd['bowling_team'].value_counts()
Out[84]:
bowling_team MI 18879 RCB 17920 KKR 17411 KXIP 17392 DD 17099 CSK 15562 RR 14111 DC 9039 SH 8888 PW 5457 GL 3545 RPS 1928 RPSS 1615 KTK 1614 Name: count, dtype: int64
In [87]:
plt.figure(figsize = (10,5))
sns.barplot(x='batting_team',y ='total_runs', data = cd)
plt.xticks(rotation = 90)
plt.show()
THIS VISUALIZATION SHOW THE PLAYER WHO HAVE WON THE MOST PLAYER OF MATCH (SEASON (2008-2017))¶
In [88]:
dd['season'].value_counts()
Out[88]:
season 2013 76 2012 74 2011 73 2010 60 2014 60 2016 60 2017 59 2015 59 2008 58 2009 57 Name: count, dtype: int64
In [90]:
import plotly.express as px
season_counts = dd['season'].value_counts().reset_index()
season_counts.columns = ['season', 'count']
# Sort by season (not strictly necessary if data is already sorted)
season_counts = season_counts.sort_values('season')
# Plotting with Plotly
fig = px.bar(season_counts,
x='season',
y='count',
title='Number of matches played in each IPL season',
labels={'season': 'Season', 'count': 'No. of matches'},
template='plotly_dark', # Example of using a dark theme
color='season') # Optional: Color bars by season
fig.update_layout(xaxis={'categoryorder':'category ascending'}) # Ensure x-axis is sorted
fig.show()
The IPL 2013 season had the highest number of matches played¶
In [93]:
plt.subplots(figsize=(10,6))
dd['toss_winner'].value_counts().plot.bar(width=0.8)
plt.show()
In [94]:
dd['toss_winner'].value_counts()
Out[94]:
toss_winner MI 85 KKR 78 DD 72 RCB 70 KXIP 68 CSK 66 RR 63 DC 43 SH 35 PW 20 GL 15 KTK 8 RPSS 7 RPS 6 Name: count, dtype: int64
CHOICE OF TOSS IN DIFFERENT SEASONS¶
In [95]:
plt.subplots()
sns.countplot(x="season",hue="toss_decision",data=dd,palette=['Red','Green'])
plt.show()
Starting from the IPL-2016 season, there is a noticeable trend favoring the decision to field¶
In [96]:
plt.subplots(figsize=(10,6))
dd['winner'].value_counts().plot.bar(width=0.8)
plt.show()
In [97]:
dd['winner'].value_counts()
Out[97]:
winner ML 92 CSK 79 KKR 78 RCB 73 KXIP 70 RR 64 DD 62 SH 42 DC 30 GL 13 PW 12 RPS 10 KTK 6 RPSS 5 Name: count, dtype: int64
INFERENCE¶
IPL T20 MATCH IN 2008 TO 2017¶
MAJOR INDICENTS¶
In July 2015, the Supreme Court of India set up a three-member committee led by former Chief Justice R M Lodha to investigate. The Justice Lodha Committee found evidence of match-fixing and betting, leading to a two-year ban for both Chennai Super Kings and Rajasthan Royals from the IPL in 2016 and 2017.¶
If this circumstances not happened, CSK may be the winner of match is most frequency¶
In [106]:
plt.figure(figsize=(5,4))
sns.countplot(x='player_of_match',data=dd)
plt.title('Most frequented')
plt.xticks(rotation=90)
plt.show()
In [107]:
dd['player_of_match'].value_counts()
Out[107]:
player_of_match
CH Gayle 18
YK Pathan 16
DA Warner 15
AB de Villiers 15
RG Sharma 14
..
AD Mathews 1
LR Shukla 1
R Bhatia 1
A Singh 1
BCJ Cutting 1
Name: count, Length: 201, dtype: int64
TAKEN TOP 5 PLAYERS¶
In [108]:
dd['player_of_match'].value_counts()[0:5]
Out[108]:
player_of_match CH Gayle 18 YK Pathan 16 DA Warner 15 AB de Villiers 15 RG Sharma 14 Name: count, dtype: int64
Chris Gayle has received the highest number of Man of the Match awards¶
Finding out the number of wins each team after batting first¶
In [109]:
batting_first=dd[dd['win_by_runs']!=0]
In [177]:
batting_first.head()
Out[177]:
| season | city | team1 | team2 | toss_winner | toss_decision | result | winner | win_by_runs | win_by_wickets | player_of_match | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017 | Hyderabad | SH | RCB | RCB | field | normal | SH | 35.0 | 0 | Yuvraj Singh |
| 4 | 2017 | Bangalore | RCB | DD | RCB | bat | normal | RCB | 15.0 | 0 | KM Jadhav |
| 13 | 2017 | Kolkata | KKR | SH | SH | field | normal | KKR | 17.0 | 0 | RV Uthappa |
| 16 | 2017 | Bangalore | RPS | RCB | RCB | field | normal | RPS | 27.0 | 0 | BA Stokes |
| 18 | 2017 | Hyderabad | SH | KXIP | KXIP | field | normal | SH | 5.0 | 0 | B Kumar |
In [181]:
batting_first['win_by_runs'].shape
Out[181]:
(240,)
In [182]:
batting_first['win_by_runs'].unique()
Out[182]:
array([35., 15., 17., 27., 5., 21., 14., 26., 3., 48., 19., 12., 7.,
9., 10., 20., 1., 33., 6., 13., 45., 29., 18., 23., 41., 25.,
11., 24., 38., 8., 16., 2., 4., 31., 34., 36., 39., 40., 37.,
22., 32., 43., 28., 42., 46., 47., 44., 30., 50.])
COMPARING 3 COLUMNS ARE TEAM 1 , TEAM 2 , WINNER TEAM AND WIN_BY_RUNS¶
In [110]:
batting_first.head()
Out[110]:
| season | city | team1 | team2 | toss_winner | toss_decision | result | winner | win_by_runs | win_by_wickets | player_of_match | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2017 | Hyderabad | SH | RCB | RCB | field | normal | SH | 35.0 | 0 | Yuvraj Singh |
| 4 | 2017 | Bangalore | RCB | DD | RCB | bat | normal | RCB | 15.0 | 0 | KM Jadhav |
| 13 | 2017 | Kolkata | KKR | SH | SH | field | normal | KKR | 17.0 | 0 | RV Uthappa |
| 16 | 2017 | Bangalore | RPS | RCB | RCB | field | normal | RPS | 27.0 | 0 | BA Stokes |
| 18 | 2017 | Hyderabad | SH | KXIP | KXIP | field | normal | SH | 5.0 | 0 | B Kumar |
In [184]:
#MAKING A HISTROGRAM
plt.figure(figsize=(7,7))
plt.hist(batting_first['win_by_runs'])
plt.title('Distribution of runs')
plt.xlabel('Runs')
plt.show() # x=win_by_runs and y=counts as happens
COMPARING FIRST BATTING TEAM HAS CHOSEN, AND THE WINNER TEAM¶
In [172]:
plt.figure(figsize=(7,7))
plt.bar(list(batting_first['winner'].value_counts()[0:3].keys()),list(batting_first['winner'].value_counts()[0:3]),color=['blue','yellow','orange'])
plt.show()
In [173]:
plt.figure(figsize=(7,7))
plt.pie(list(batting_first['winner'].value_counts()),labels=list(batting_first['winner'].value_counts().keys()))
plt.show()
Finding out the number of wins each team after batting second¶
In [174]:
batting_second = dd[dd['win_by_wickets']!=0]
In [175]:
batting_second.head()
Out[175]:
| season | city | team1 | team2 | toss_winner | toss_decision | result | winner | win_by_runs | win_by_wickets | player_of_match | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2017 | Pune | MI | RPS | RPS | field | normal | RPS | 0.0 | 7 | SPD Smith |
| 2 | 2017 | Rajkot | GL | KKR | KKR | field | normal | KKR | 0.0 | 10 | CA Lynn |
| 3 | 2017 | Indore | RPS | KXIP | KXIP | field | normal | KXIP | 0.0 | 6 | GJ Maxwell |
| 5 | 2017 | Hyderabad | GL | SH | SH | field | normal | SH | 0.0 | 9 | Rashid Khan |
| 6 | 2017 | Mumbai | KKR | MI | MI | field | normal | M | 0.0 | 4 | N Rana |
In [176]:
batting_second['win_by_wickets'].unique()
Out[176]:
array([ 7, 10, 6, 9, 4, 8, 5, 2, 3, 1], dtype=int64)
In [13]:
plt.figure(figsize=(7,7))
plt.hist(batting_second['win_by_wickets'],bins=20)
plt.show()
we have a interesting histrogram here, 70 instances or matches were the batting second has 1 with 7 wickets in hand¶
It seems that if team batting second as won the match then most probabile it hood not have lost or lot of wickets¶
IT SEEMS LIKE NORMAL DISTRIBUTION , BUT IT IS DISCRETE¶
EXACTLY 10 MATCHES OF 1 WITH 10 WICKETS REMAINING, WHERE THE TEAM BATTING SECOND HAS NOT EVEN LOST A SINGLE WICKET GONE TO DISMENT THEIR OPPENENT¶
Finding out the number of wins each team after batting second¶
In [243]:
plt.figure(figsize=(7,7))
plt.bar(list(batting_second['winner'].value_counts()[0:3].keys()),list(batting_second['winner'].value_counts()[0:3]),color=['purple','blue','red'])
plt.show()
In [244]:
batting_second['winner'].value_counts()
Out[244]:
winner KKR 46 M 44 RCB 42 DD 41 RR 38 KXIP 36 CSK 33 SH 18 GL 12 DC 11 PW 6 RPS 5 KTK 4 RPSS 3 Name: count, dtype: int64
In [245]:
plt.figure(figsize=(7,7))
plt.pie(list(batting_second['winner'].value_counts()),labels=list(batting_second['winner'].value_counts().keys()))
plt.show()
In [247]:
#Finding out how many times team has won the match after winning the toss
num.sum(dd['toss_winner']==dd['winner'])
Out[247]:
325
In [248]:
325/636
Out[248]:
0.5110062893081762
51% of the times a team winning the toss also win the match so, it actually means that not much of effect on toss.¶
its more like a equally(50-50) neither its heads or tails so toss may be afftect the match that much¶
HYPOTHESIS TEST¶
CHI SQUARE TESTING¶
In [249]:
from scipy.stats import chi2_contingency
contingency_table = pd.crosstab(dd["city"],dd["winner"])
chi2_result = chi2_contingency(contingency_table,correction = False)
print("chi square result:",chi2_result)
chi square result: Chi2ContingencyResult(statistic=1439.865131584083, pvalue=8.258671120736886e-124, dof=377, expected_freq=array([[ 0.86102236, 0.32428115, 0.68210863, 0.14536741, 0.86102236,
0.06709265, 0.7715655 , 1.02875399, 0.1341853 , 0.80511182,
0.11182109, 0.05591054, 0.70447284, 0.44728435],
[ 1.47603834, 0.55591054, 1.16932907, 0.24920128, 1.47603834,
0.11501597, 1.32268371, 1.76357827, 0.23003195, 1.38019169,
0.19169329, 0.09584665, 1.20766773, 0.76677316],
[ 7.87220447, 2.96485623, 6.23642173, 1.32907348, 7.87220447,
0.61341853, 7.0543131 , 9.4057508 , 1.22683706, 7.36102236,
1.02236422, 0.51118211, 6.44089457, 4.08945687],
[ 0.24600639, 0.09265176, 0.19488818, 0.04153355, 0.24600639,
0.01916933, 0.22044728, 0.29392971, 0.03833866, 0.23003195,
0.03194888, 0.01597444, 0.20127796, 0.12779553],
[ 0.86102236, 0.32428115, 0.68210863, 0.14536741, 0.86102236,
0.06709265, 0.7715655 , 1.02875399, 0.1341853 , 0.80511182,
0.11182109, 0.05591054, 0.70447284, 0.44728435],
[ 1.47603834, 0.55591054, 1.16932907, 0.24920128, 1.47603834,
0.11501597, 1.32268371, 1.76357827, 0.23003195, 1.38019169,
0.19169329, 0.09584665, 1.20766773, 0.76677316],
[ 5.65814696, 2.13099042, 4.48242812, 0.95527157, 5.65814696,
0.44089457, 5.07028754, 6.76038339, 0.88178914, 5.29073482,
0.73482428, 0.36741214, 4.62939297, 2.93929712],
[ 5.90415335, 2.22364217, 4.67731629, 0.99680511, 5.90415335,
0.4600639 , 5.29073482, 7.0543131 , 0.9201278 , 5.52076677,
0.76677316, 0.38338658, 4.83067093, 3.06709265],
[ 0.86102236, 0.32428115, 0.68210863, 0.14536741, 0.86102236,
0.06709265, 0.7715655 , 1.02875399, 0.1341853 , 0.80511182,
0.11182109, 0.05591054, 0.70447284, 0.44728435],
[ 7.2571885 , 2.73322684, 5.74920128, 1.22523962, 7.2571885 ,
0.56549521, 6.50319489, 8.67092652, 1.13099042, 6.78594249,
0.94249201, 0.47124601, 5.93769968, 3.76996805],
[ 1.10702875, 0.41693291, 0.87699681, 0.18690096, 1.10702875,
0.08626198, 0.99201278, 1.32268371, 0.17252396, 1.03514377,
0.14376997, 0.07188498, 0.9057508 , 0.57507987],
[ 1.84504792, 0.69488818, 1.46166134, 0.3115016 , 1.84504792,
0.14376997, 1.65335463, 2.20447284, 0.28753994, 1.72523962,
0.23961661, 0.11980831, 1.50958466, 0.95846645],
[ 0.36900958, 0.13897764, 0.29233227, 0.06230032, 0.36900958,
0.02875399, 0.33067093, 0.44089457, 0.05750799, 0.34504792,
0.04792332, 0.02396166, 0.30191693, 0.19169329],
[ 6.02715655, 2.26996805, 4.77476038, 1.01757188, 6.02715655,
0.46964856, 5.40095847, 7.20127796, 0.93929712, 5.63578275,
0.7827476 , 0.3913738 , 4.9313099 , 3.13099042],
[ 0.61501597, 0.23162939, 0.48722045, 0.10383387, 0.61501597,
0.04792332, 0.55111821, 0.73482428, 0.09584665, 0.57507987,
0.0798722 , 0.0399361 , 0.50319489, 0.31948882],
[ 4.05910543, 1.52875399, 3.21565495, 0.68530351, 4.05910543,
0.31629393, 3.63738019, 4.84984026, 0.63258786, 3.79552716,
0.52715655, 0.26357827, 3.32108626, 2.1086262 ],
[ 0.98402556, 0.37060703, 0.77955272, 0.16613419, 0.98402556,
0.07667732, 0.88178914, 1.17571885, 0.15335463, 0.9201278 ,
0.12779553, 0.06389776, 0.80511182, 0.51118211],
[ 0.49201278, 0.18530351, 0.38977636, 0.08306709, 0.49201278,
0.03833866, 0.44089457, 0.58785942, 0.07667732, 0.4600639 ,
0.06389776, 0.03194888, 0.40255591, 0.25559105],
[ 0.36900958, 0.13897764, 0.29233227, 0.06230032, 0.36900958,
0.02875399, 0.33067093, 0.44089457, 0.05750799, 0.34504792,
0.04792332, 0.02396166, 0.30191693, 0.19169329],
[ 0.61501597, 0.23162939, 0.48722045, 0.10383387, 0.61501597,
0.04792332, 0.55111821, 0.73482428, 0.09584665, 0.57507987,
0.0798722 , 0.0399361 , 0.50319489, 0.31948882],
[ 7.50319489, 2.82587859, 5.94408946, 1.26677316, 7.50319489,
0.58466454, 6.72364217, 8.96485623, 1.16932907, 7.01597444,
0.97444089, 0.48722045, 6.13897764, 3.89776358],
[10.45527157, 3.93769968, 8.2827476 , 1.76517572, 10.45527157,
0.81469649, 9.36900958, 12.49201278, 1.62939297, 9.77635783,
1.35782748, 0.67891374, 8.5543131 , 5.4313099 ],
[ 0.36900958, 0.13897764, 0.29233227, 0.06230032, 0.36900958,
0.02875399, 0.33067093, 0.44089457, 0.05750799, 0.34504792,
0.04792332, 0.02396166, 0.30191693, 0.19169329],
[ 0.86102236, 0.32428115, 0.68210863, 0.14536741, 0.86102236,
0.06709265, 0.7715655 , 1.02875399, 0.1341853 , 0.80511182,
0.11182109, 0.05591054, 0.70447284, 0.44728435],
[ 3.93610224, 1.48242812, 3.11821086, 0.66453674, 3.93610224,
0.30670927, 3.52715655, 4.7028754 , 0.61341853, 3.68051118,
0.51118211, 0.25559105, 3.22044728, 2.04472843],
[ 0.73801917, 0.27795527, 0.58466454, 0.12460064, 0.73801917,
0.05750799, 0.66134185, 0.88178914, 0.11501597, 0.69009585,
0.09584665, 0.04792332, 0.60383387, 0.38338658],
[ 1.23003195, 0.46325879, 0.97444089, 0.20766773, 1.23003195,
0.09584665, 1.10223642, 1.46964856, 0.19169329, 1.15015974,
0.15974441, 0.0798722 , 1.00638978, 0.63897764],
[ 0.86102236, 0.32428115, 0.68210863, 0.14536741, 0.86102236,
0.06709265, 0.7715655 , 1.02875399, 0.1341853 , 0.80511182,
0.11182109, 0.05591054, 0.70447284, 0.44728435],
[ 0.73801917, 0.27795527, 0.58466454, 0.12460064, 0.73801917,
0.05750799, 0.66134185, 0.88178914, 0.11501597, 0.69009585,
0.09584665, 0.04792332, 0.60383387, 0.38338658],
[ 1.35303514, 0.50958466, 1.07188498, 0.2284345 , 1.35303514,
0.10543131, 1.21246006, 1.61661342, 0.21086262, 1.26517572,
0.17571885, 0.08785942, 1.10702875, 0.7028754 ]]))
In [250]:
p_value = 8.258671120736886e-124
alpha = 0.05
if p_value<alpha:
print("Reject the null hypothesis. There is a link")
else:
print("Fail to reject the null hypothesis")
Reject the null hypothesis. There is a link
In [ ]: